{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c281e1af-1568-4bd8-8940-b1b9c89d12a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.cluster import KMeans\n",
    "import xgboost as xgb\n",
    "import shap\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import classification_report\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "63989f7b-a3ac-48bc-b912-c0f0e28389f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ[\"LOKY_MAX_CPU_COUNT\"] = \"4\" "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "86de8b81-a867-4191-b69d-96ea0fb069a6",
   "metadata": {},
   "source": [
    "# 0. Map visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a0cb77ab-ad64-496d-93f8-5f225b031e7f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_excel(r\"E\\df.xlsx\")\n",
    "gdf_base = gpd.read_file(r\"\\TL_SCCO_SIG.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abda0407-7cd1-441d-a8b2-c35fe66e7c12",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge\n",
    "gdf = gdf_base.merge(df, left_on='SIG_KOR_NM', right_on='city and district')\n",
    "\n",
    "# Variables and display names\n",
    "map_vars = [\n",
    "    ('Heat_Index', 'HI'),\n",
    "    ('Environment_Stress_Index', 'ESI'),\n",
    "    ('WBGT_JME', 'WBGT'),\n",
    "    ('EVI', 'EVI'),\n",
    "    ('NDVI', 'NDVI'),\n",
    "    ('LAI', 'LAI'),\n",
    "    ('average_income', 'Average Income'),\n",
    "    ('realestate_tax', 'Real Estate Tax')\n",
    "]\n",
    "\n",
    "# Plot setup\n",
    "fig, axs = plt.subplots(2, 4, figsize=(24, 12))\n",
    "axs = axs.flatten()\n",
    "\n",
    "for i, (var, title) in enumerate(map_vars):\n",
    "    ax = axs[i]\n",
    "    gdf.plot(\n",
    "        column=var,\n",
    "        cmap='YlGnBu',\n",
    "        linewidth=0.3,\n",
    "        edgecolor='black',\n",
    "        ax=ax,\n",
    "        legend=True\n",
    "    )\n",
    "    ax.set_title(title, fontsize=30)\n",
    "    ax.axis('off')\n",
    "\n",
    "    # Resize legend label fonts\n",
    "    cbar = ax.get_figure().axes[-1]  # last axis is colorbar\n",
    "    for t in cbar.get_yticklabels():\n",
    "        t.set_fontsize(20)\n",
    "\n",
    "plt.tight_layout()\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c6b3b4b4-ddbb-4571-977c-604d235532c9",
   "metadata": {},
   "source": [
    "# 1. PCA results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3bdd93ea-900c-4866-8820-28df440bcbdb",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_excel(r\"E\\df.xlsx\")\n",
    "gdf_base = gpd.read_file(r\"\\TL_SCCO_SIG.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8fa04d09-cbeb-49eb-8a22-b49720c639b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "matplotlib.rcParams['axes.unicode_minus'] = False\n",
    "plt.rc(\"font\", family = 'Malgun Gothic')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "71fb33de-53aa-462a-bb21-453124ffdda9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reverse and standardize\n",
    "for var in ['average_income', 'realestate_tax', 'EVI', 'NDVI', 'LAI']:\n",
    "    df[var + '_inv'] = -1 * StandardScaler().fit_transform(df[[var]])\n",
    "\n",
    "# PCA\n",
    "df['heat_stress_score'] = PCA(n_components=1).fit_transform(\n",
    "    StandardScaler().fit_transform(df[['Heat_Index', 'Environment_Stress_Index', 'WBGT_JME']])\n",
    ")\n",
    "df['green_deficit_score'] = PCA(n_components=1).fit_transform(\n",
    "    df[['EVI_inv', 'NDVI_inv', 'LAI_inv']]\n",
    ")\n",
    "\n",
    "df_clean = df.dropna(subset=['average_income_inv', 'realestate_tax_inv', 'city and district'])\n",
    "df_clean['economic_disadvantage_score'] = PCA(n_components=1).fit_transform(\n",
    "    df_clean[['total_income_inv', 'realestate_tax_inv']]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7594975-45b8-4516-9fbf-7ab8aed774c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define original variable sets\n",
    "heat_vars = ['Heat_Index', 'Environment_Stress_Index', 'WBGT_JME']\n",
    "green_vars = ['EVI_inv', 'NDVI_inv', 'LAI_inv']\n",
    "econ_vars = ['average_income_inv', 'realestate_tax_inv']\n",
    "\n",
    "# Prepare scaled data\n",
    "heat_scaled = StandardScaler().fit_transform(df[heat_vars])\n",
    "green_scaled = df[green_vars].values\n",
    "econ_scaled = df_clean[econ_vars].values\n",
    "\n",
    "# Full PCA\n",
    "pca_heat_full = PCA().fit(heat_scaled)\n",
    "pca_green_full = PCA().fit(green_scaled)\n",
    "pca_econ_full = PCA().fit(econ_scaled)\n",
    "\n",
    "# Explained variance ratios\n",
    "heat_var_ratio = pca_heat_full.explained_variance_ratio_\n",
    "green_var_ratio = pca_green_full.explained_variance_ratio_\n",
    "econ_var_ratio = pca_econ_full.explained_variance_ratio_\n",
    "\n",
    "labels = [\"PC1\", \"PC2\", \"PC3\"]\n",
    "\n",
    "fig, axs = plt.subplots(1, 3, figsize=(18, 6))\n",
    "\n",
    "# Heat Stress\n",
    "axs[0].bar(labels[:len(heat_var_ratio)], heat_var_ratio, color='salmon')\n",
    "axs[0].set_title(\"(a) Heat Stress PCA Plot\", fontsize=23, pad=20)\n",
    "axs[0].set_ylabel(\"Explained Variance Ratio\", fontsize=16)\n",
    "axs[0].tick_params(axis='both', labelsize=15)\n",
    "axs[0].grid(True, linestyle='--', alpha=0.5)\n",
    "for i, v in enumerate(heat_var_ratio):\n",
    "    axs[0].text(i, v + 0.01, f\"{v:.2f}\", ha='center', fontsize=22)\n",
    "\n",
    "# Green Deficit\n",
    "axs[1].bar(labels[:len(green_var_ratio)], green_var_ratio, color='seagreen')\n",
    "axs[1].set_title(\"(b) Green Deficit PCA Plot\", fontsize=23, pad=20)\n",
    "axs[1].set_ylabel(\"Explained Variance Ratio\", fontsize=16)\n",
    "axs[1].tick_params(axis='both', labelsize=15)\n",
    "axs[1].grid(True, linestyle='--', alpha=0.5)\n",
    "for i, v in enumerate(green_var_ratio):\n",
    "    axs[1].text(i, v + 0.01, f\"{v:.2f}\", ha='center', fontsize=22)\n",
    "\n",
    "# Economic Disadvantage\n",
    "axs[2].bar(labels[:len(econ_var_ratio)], econ_var_ratio, color='steelblue')\n",
    "axs[2].set_title(\"(c) Economic Disadvantage PCA Plot\", fontsize=23, pad=20)\n",
    "axs[2].set_ylabel(\"Explained Variance Ratio\", fontsize=16)\n",
    "axs[2].tick_params(axis='both', labelsize=15)\n",
    "axs[2].grid(True, linestyle='--', alpha=0.5)\n",
    "for i, v in enumerate(econ_var_ratio):\n",
    "    axs[2].text(i, v + 0.01, f\"{v:.2f}\", ha='center', fontsize=22)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b04c00dd-8d16-46ef-8692-8b159571da46",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "e8f0f26c-6472-4c18-8cd9-adba32849619",
   "metadata": {},
   "source": [
    "# 2. K-mmeans++"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70d018f6-6023-44dd-ab7f-1d0c932f86f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_excel(r\"E\\df.xlsx\")\n",
    "gdf_base = gpd.read_file(r\"\\TL_SCCO_SIG.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7aaf8b7c-09da-417c-8f8a-a71d07b3c14f",
   "metadata": {},
   "outputs": [],
   "source": [
    "matplotlib.rcParams['axes.unicode_minus'] = False\n",
    "plt.rc(\"font\", family = 'Malgun Gothic')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16bcff37-caa1-4c7b-bad6-e8ac825871c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reverse and standardize\n",
    "for var in ['average_income', 'realestate_tax', 'EVI', 'NDVI', 'LAI']:\n",
    "    df[var + '_inv'] = -1 * StandardScaler().fit_transform(df[[var]])\n",
    "\n",
    "#df['air_quality_z'] = StandardScaler().fit_transform(df[['air_quality']])\n",
    "#df['inequality_z'] = StandardScaler().fit_transform(df[['inequality']])\n",
    "\n",
    "# PCA\n",
    "df['heat_stress_score'] = PCA(n_components=1).fit_transform(\n",
    "    StandardScaler().fit_transform(df[['Heat_Index', 'Environment_Stress_Index', 'WBGT_JME']])\n",
    ")\n",
    "df['green_deficit_score'] = PCA(n_components=1).fit_transform(\n",
    "    df[['EVI_inv', 'NDVI_inv', 'LAI_inv']]\n",
    ")\n",
    "\n",
    "df_clean = df.dropna(subset=['total_income_inv', 'realestate_tax_inv', 'city and district'])\n",
    "df_clean['economic_disadvantage_score'] = PCA(n_components=1).fit_transform(\n",
    "    df_clean[['average_income_inv', 'realestate_tax_inv']]\n",
    ")\n",
    "\n",
    "# Final clustering variables\n",
    "cluster_vars = [\n",
    "    'heat_stress_score',\n",
    "    'green_deficit_score',\n",
    "\n",
    "    'economic_disadvantage_score',\n",
    "]\n",
    "X_features = df_clean[cluster_vars].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f26072f5-e6e8-403a-b8ea-ee7f6c0c5f16",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-run silhouette visualization and assign best cluster labels\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.metrics import silhouette_samples, silhouette_score\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.cm as cm\n",
    "\n",
    "cluster_range = [2, 3, 4, 5, 6, 7]\n",
    "best_k = None\n",
    "best_score = -1\n",
    "best_labels = None\n",
    "\n",
    "# Regenerate silhouette visualization with larger text and save to file\n",
    "fig, axs = plt.subplots(figsize=(5 * len(cluster_range), 5), nrows=1, ncols=len(cluster_range))\n",
    "\n",
    "for idx, n_clusters in enumerate(cluster_range):\n",
    "    kmeans = KMeans(n_clusters=n_clusters, max_iter=500, random_state=0)\n",
    "    labels = kmeans.fit_predict(X_features)\n",
    "    score = silhouette_score(X_features, labels)\n",
    "    sil_values = silhouette_samples(X_features, labels)\n",
    "\n",
    "    y_lower = 10\n",
    "    axs[idx].set_title(f'k={n_clusters}\\nSilhouette={round(score, 3)}', fontsize=35)\n",
    "    axs[idx].set_xlabel(\"Silhouette Coefficients\", fontsize=18)\n",
    "    axs[idx].set_xlim([-0.1, 1])\n",
    "    axs[idx].set_ylim([0, len(X_features) + (n_clusters + 1) * 10])\n",
    "    axs[idx].tick_params(labelsize=15)\n",
    "    axs[idx].set_yticks([])\n",
    "\n",
    "    for i in range(n_clusters):\n",
    "        ith_values = sil_values[labels == i]\n",
    "        ith_values.sort()\n",
    "        y_upper = y_lower + len(ith_values)\n",
    "        color = cm.nipy_spectral(float(i) / n_clusters)\n",
    "        axs[idx].fill_betweenx(np.arange(y_lower, y_upper), 0, ith_values,\n",
    "                               facecolor=color, edgecolor=color, alpha=0.7)\n",
    "        axs[idx].text(-0.05, y_lower + 0.5 * len(ith_values), str(i), fontsize=12)\n",
    "        y_lower = y_upper + 10\n",
    "\n",
    "    axs[idx].axvline(x=score, color=\"red\", linestyle=\"--\")\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b4bb04c-1db8-47f6-a8f0-bd73c1151a64",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2bb649eb-ae35-40e4-b692-291ca9d6b135",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_excel(r\"E\\df.xlsx\")\n",
    "gdf_base = gpd.read_file(r\"\\TL_SCCO_SIG.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c5953f4-b008-4528-b901-32869e434e0d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reverse and standardize\n",
    "for var in ['total_income', 'realestate_tax', 'EVI', 'NDVI', 'LAI']:\n",
    "    df[var + '_inv'] = -1 * StandardScaler().fit_transform(df[[var]])\n",
    "\n",
    "#df['air_quality_z'] = StandardScaler().fit_transform(df[['air_quality']])\n",
    "#df['inequality_z'] = StandardScaler().fit_transform(df[['inequality']])\n",
    "\n",
    "# PCA\n",
    "df['heat_stress_score'] = PCA(n_components=1).fit_transform(\n",
    "    StandardScaler().fit_transform(df[['Heat_Index', 'Environment_Stress_Index', 'WBGT_JME']])\n",
    ")\n",
    "df['green_deficit_score'] = PCA(n_components=1).fit_transform(\n",
    "    df[['EVI_inv', 'NDVI_inv', 'LAI_inv']]\n",
    ")\n",
    "\n",
    "df_clean = df.dropna(subset=['total_income_inv', 'realestate_tax_inv', 'city and district'])\n",
    "df_clean['economic_disadvantage_score'] = PCA(n_components=1).fit_transform(\n",
    "    df_clean[['total_income_inv', 'realestate_tax_inv']]\n",
    ")\n",
    "\n",
    "# Final clustering variables\n",
    "cluster_vars = [\n",
    "    'heat_stress_score',\n",
    "    'green_deficit_score',   \n",
    "    'economic_disadvantage_score',\n",
    "    \n",
    "]\n",
    "X_features = df_clean[cluster_vars].values\n",
    "\n",
    "# Best K from silhouette plot\n",
    "best_k = 4  # Adjust this based on your result\n",
    "kmeans = KMeans(n_clusters=best_k, random_state=0)\n",
    "df_clean['kmeans_cluster'] = kmeans.fit_predict(X_features)\n",
    "\n",
    "# Merge with shapefile and map\n",
    "gdf = gdf_base.merge(df_clean, left_on='SIG_KOR_NM', right_on='city and district')\n",
    "\n",
    "fig, ax = plt.subplots(1, 1, figsize=(12, 10))\n",
    "gdf.plot(column='kmeans_cluster', cmap='Set2', legend=False,  # 👈 no legend here\n",
    "         linewidth=0.3, edgecolor='black', ax=ax)\n",
    "ax.set_title(f\"K-Means++ Vulnerability Clusters (k={best_k})\", fontsize=16)\n",
    "ax.axis('off')\n",
    "plt.tight_layout()\n",
    "plt.savefig(\"kmeans_clusters_k5_nolegend.png\", dpi=600)  # optional save\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d2cb153-c7ae-4778-9ca3-a756696f627a",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "6651ddde-08ca-464e-bba4-07037caf2f40",
   "metadata": {},
   "source": [
    "# 3. SKATER"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de03839d-710e-47bd-8e9d-b372a3fe5a79",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_excel(r\"E\\df.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ea3f79f-00c2-4bbd-bc6a-b8dbced251ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reverse and standardize relevant variables\n",
    "reverse_vars = ['average_income', 'realestate_tax', 'homeownership', 'EVI', 'NDVI', 'LAI']\n",
    "for var in reverse_vars:\n",
    "    df[var + '_inv'] = -1 * StandardScaler().fit_transform(df[[var]])\n",
    "\n",
    "# PCA for Heat Stress\n",
    "heat_vars = ['Heat_Index', 'Environment_Stress_Index', 'WBGT_JME']\n",
    "df['heat_stress_score'] = PCA(n_components=1).fit_transform(StandardScaler().fit_transform(df[heat_vars]))\n",
    "\n",
    "# PCA for Green Deficit\n",
    "green_vars = ['EVI_inv', 'NDVI_inv', 'LAI_inv']\n",
    "df['green_deficit_score'] = PCA(n_components=1).fit_transform(df[green_vars])\n",
    "\n",
    "# Drop rows with missing values for economic PCA\n",
    "econ_vars = ['average_income_inv', 'realestate_tax_inv', 'homeownership_inv']\n",
    "df_clean = df.dropna(subset=econ_vars)\n",
    "\n",
    "# PCA for Economic Disadvantage\n",
    "df_clean['economic_disadvantage_score'] = PCA(n_components=1).fit_transform(df_clean[econ_vars].values)\n",
    "\n",
    "# Final clustering variables\n",
    "cluster_vars = [\n",
    "    'heat_stress_score',\n",
    "    'green_deficit_score',\n",
    "    'economic_disadvantage_score',\n",
    "]\n",
    "\n",
    "# Load shapefile and merge\n",
    "gdf = gpd.read_file(r\"\\TL_SCCO_SIG.json\")\n",
    "gdf = gdf.merge(df_clean, left_on='SIG_KOR_NM', right_on='city and district')\n",
    "\n",
    "# Spatial weights and SKATER\n",
    "w = Queen.from_dataframe(gdf)\n",
    "model = Skater(gdf, w, attrs_name=cluster_vars, n_clusters=4)\n",
    "model.solve()\n",
    "gdf['vulnerability_cluster'] = model.labels_\n",
    "\n",
    "# Plot\n",
    "fig, ax = plt.subplots(1, 1, figsize=(12, 10))\n",
    "gdf.plot(column='vulnerability_cluster', cmap='Set3', legend=True,\n",
    "         linewidth=0.3, edgecolor='black', ax=ax)\n",
    "ax.set_title(\"Structural Vulnerability Clusters (SKATER)\", fontsize=16)\n",
    "ax.axis('off')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f311bd1-591b-47a2-9b61-9a8d100bfa5f",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "8a36960c-39bb-40b8-a443-acd68359ad15",
   "metadata": {},
   "source": [
    "# 4. Cluster Profiles Based on Original Environmental and Economic Indicators "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d363dd91-3e77-43ac-a39b-42293507591d",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_excel(r\"E\\df.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "958b313a-ca25-4e9d-816e-e51ff129b71d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate three subplots for each domain with value labels and increased font size\n",
    "fig2, axs = plt.subplots(1, 3, figsize=(20, 6))\n",
    "\n",
    "# Heat-related\n",
    "heat_cols = ['Heat_Index', 'Environment_Stress_Index', 'WBGT_JME']\n",
    "cluster_profile[heat_cols].rename(columns=legend_labels).plot(kind='bar', ax=axs[0], fontsize=12)\n",
    "axs[0].set_title(\"(a) Heat Stress\", fontsize=28, pad=15)\n",
    "axs[0].set_ylabel(\"Mean Value\", fontsize=20)\n",
    "axs[0].set_xlabel(\"Cluster\", fontsize=20)\n",
    "axs[0].tick_params(axis='x', labelsize=16, rotation=90)\n",
    "axs[0].legend(loc='best', fontsize=18)\n",
    "axs[0].grid(True, linestyle='--', alpha=0.5)\n",
    "for container in axs[0].containers:\n",
    "    axs[0].bar_label(container, fmt='%.1f', fontsize=17)\n",
    "\n",
    "# Green-related\n",
    "green_cols = ['EVI', 'NDVI', 'LAI']\n",
    "cluster_profile[green_cols].rename(columns=legend_labels).plot(kind='bar', ax=axs[1], fontsize=12)\n",
    "axs[1].set_title(\"(b) Green Deficit\", fontsize=28, pad=15)\n",
    "axs[1].set_ylabel(\"Mean Value\", fontsize=20)\n",
    "axs[1].set_xlabel(\"Cluster\", fontsize=20)\n",
    "axs[1].tick_params(axis='x', labelsize=16)\n",
    "axs[1].legend(loc='best', fontsize=18)\n",
    "axs[1].grid(True, linestyle='--', alpha=0.5)\n",
    "for container in axs[1].containers:\n",
    "    axs[1].bar_label(container, fmt='%.3f', fontsize=17)\n",
    "\n",
    "# Economic-related\n",
    "econ_cols = ['average_income', 'realestate_tax']\n",
    "cluster_profile[econ_cols].rename(columns=legend_labels).plot(kind='bar', ax=axs[2], fontsize=12)\n",
    "axs[2].set_title(\"(c) Economic Disadvantage\", fontsize=28, pad=15)\n",
    "axs[2].set_ylabel(\"Mean Value\", fontsize=20)\n",
    "axs[2].set_xlabel(\"Cluster\", fontsize=20)\n",
    "axs[2].tick_params(axis='x', labelsize=16)\n",
    "axs[2].legend(loc='best', fontsize=18)\n",
    "axs[2].grid(True, linestyle='--', alpha=0.5)\n",
    "for container in axs[2].containers:\n",
    "    axs[2].bar_label(container, fmt='%.1f', fontsize=17)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb459310-8840-44c2-a043-887602ea0ce7",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "7453408e-7cf5-4f51-a7fd-bed7d79922bd",
   "metadata": {},
   "source": [
    "# 5. XGBoost and SHAP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6863f9d5-6c62-47f6-83d1-62d1761561ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_excel(r\"E\\df.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7624f59f-f18b-4c1d-bbcd-f5b068b32740",
   "metadata": {},
   "outputs": [],
   "source": [
    "# STEP 1: Reverse-code and standardize variables\n",
    "reverse_vars = ['average_income', 'realestate_tax', 'EVI', 'NDVI', 'LAI']\n",
    "for var in reverse_vars:\n",
    "    df[var + '_inv'] = -1 * StandardScaler().fit_transform(df[[var]])\n",
    "\n",
    "# STEP 2: PCA for heat, green, economic\n",
    "df['heat_stress_score'] = PCA(n_components=1).fit_transform(\n",
    "    StandardScaler().fit_transform(df[['Heat_Index', 'Environment_Stress_Index', 'WBGT_JME']])\n",
    ")\n",
    "df['green_deficit_score'] = PCA(n_components=1).fit_transform(df[['EVI_inv', 'NDVI_inv', 'LAI_inv']])\n",
    "df_clean = df.dropna(subset=['average_income_inv', 'realestate_tax_inv', 'city and district'])\n",
    "df_clean['economic_disadvantage_score'] = PCA(n_components=1).fit_transform(\n",
    "    df_clean[['total_income_inv', 'realestate_tax_inv']]\n",
    ")\n",
    "\n",
    "# STEP 3: KMeans clustering\n",
    "cluster_vars = ['heat_stress_score', 'green_deficit_score', 'economic_disadvantage_score']\n",
    "X_kmeans = df_clean[cluster_vars].values\n",
    "kmeans = KMeans(n_clusters=4, random_state=0)\n",
    "df_clean['kmeans_cluster'] = kmeans.fit_predict(X_kmeans)\n",
    "\n",
    "# STEP 4: XGBoost modeling (use original variables as input)\n",
    "\n",
    "y = df_clean['kmeans_cluster']\n",
    "\n",
    "# Rename variables for publication\n",
    "X = X.rename(columns={\n",
    "    'Heat_Index': 'HI',\n",
    "    'Environment_Stress_Index': 'ESI',\n",
    "    'WBGT_JME': 'WBGT',\n",
    "    'EVI': 'EVI',\n",
    "    'NDVI': 'NDVI',\n",
    "    'LAI': 'LAI',\n",
    "    'average_income': 'Average income',\n",
    "    'realestate_tax': 'Real estate tax'\n",
    "})\n",
    "\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=0)\n",
    "\n",
    "model = xgb.XGBClassifier(objective='multi:softprob', num_class=4, eval_metric='mlogloss')\n",
    "model.fit(X_train, y_train)\n",
    "\n",
    "# STEP 5: Evaluate classification (optional)\n",
    "y_pred = model.predict(X_test)\n",
    "print(classification_report(y_test, y_pred))\n",
    "\n",
    "# STEP 6: SHAP\n",
    "explainer = shap.Explainer(model)\n",
    "shap_values = explainer(X_train)\n",
    "\n",
    "# STEP 7: SHAP global plots\n",
    "shap.summary_plot(shap_values, X_train, plot_type=\"bar\")\n",
    "#shap.summary_plot(shap_values, X_train)\n",
    "\n",
    "# STEP 8: Safe force plot (works for both multi-class and unified)\n",
    "shap.initjs()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b91f6aa-9901-4c58-bb8e-f2b767aed7a9",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
